Here we perform some basic EDA operations.
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import feather
from pandas_profiling import ProfileReport
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sn
import psutil
from translate import Translator
# Move the working directory from the notebook folder into the sibling 'data'
# folder; all feather paths below are relative to it.
os.chdir('..')
os.chdir('data')
os.getcwd()
# Load the matched LMA 'afgifte' (waste transfer) records for 2019-2020.
df = feather.read_dataframe('lma/af_2019-2020_matched-codes.feather')
For our research, we are looking at the location of 'eerste afnemers', the receivers of secondary resources. However, there is a problem - some locations of 'eerste afnemers' actually represent the location of the offices of the reusing company, rather than the actual location of reuse. For example, for the process of using rubble as a foundation for roads, the eerste afnemers' locations are the locations of the contractors' offices, not the location of the roads that are being filled.
Because of this, we need to talk to experts from LMA to determine which flows are less likely to have accurate 'eerste afnemers' locations (such as rubble), and which could be more accurate (metal, plastic). The purpose of this section is therefore to prepare a list of major flow types from the LMA dataset, which we can use to discuss with LMA experts. The types will be categorized by VMC (processing type), SBI (industry type) and EWC/GNC (material type).
# clean df
df = feather.read_dataframe('lma/af_2019-2020_matched-codes.feather')
df = df[['eaNaam', 'gnc', 'ewc', 'vmc', 'sbi', 'kg']]
df.sbi = df.sbi.replace('nan', '0')
df = df.replace([None, 'nan', '0'], '--')
# change codes to chapters
df.gnc = df.gnc.str[:2]
df.ewc = df.ewc.str[:2]
df.vmc = df.vmc.str[:1]
# sbi chapters
df.sbi = df.sbi.str.split(',')
def chap(sbis):
    """Reduce a list of full SBI codes to their unique 2-digit chapters.

    The chapters are returned sorted so the result is deterministic.
    The original returned ``list(set(...))``, whose iteration order varies
    with hash randomization between interpreter runs, which made the
    comma-joined chapter strings (and every groupby on them) irreproducible.
    """
    return sorted({sbi[:2] for sbi in sbis})
# Map every row's SBI list to its deduplicated chapter list, record how many
# chapters each row carries, then flatten the chapters to one comma-joined
# string. (The bound functions replace the redundant lambda wrappers; the
# resulting columns are identical.)
df.sbi = df.sbi.map(chap)
df['sbiLen'] = df.sbi.map(len)
df.sbi = df.sbi.map(','.join)
There are many different combinations of SBI chapters because many companies hold multiple SBI codes, and each distinct combination produces its own chapter string.
# flow types
# One row per unique (gnc, ewc, vmc, sbi) combination, heaviest first.
# NOTE(review): .sum() also aggregates the non-numeric eaNaam column by string
# concatenation; only kg (and sbiLen) are used downstream -- confirm this is
# intended before switching to sum(numeric_only=True).
ft = df.groupby(['gnc', 'ewc', 'vmc', 'sbi']).sum().reset_index().sort_values('kg', ascending=False)
# # add chapter text (disabled -- descriptions are merged in later, in the
# # 'ADD DESCRIPTIONS' section further down)
# ewcC = feather.read_dataframe('classification/ewc.feather')
# ft = pd.merge(ft, ewcC, how='left', on='ewc')
# vmcC = feather.read_dataframe('classification/vmc.feather')
# ft = pd.merge(ft, vmcC, how='left', on='vmc')
# gncC = feather.read_dataframe('classification/gnc.feather')
# gncC = gncC[['Code', 'descDetail']]
# gncC.rename(columns={'Code': 'gnc', 'descDetail': 'gncDesc'}, inplace=True)
# ft = pd.merge(ft, gncC, how='left', on='gnc')
# Human-readable label per flow type, used as the pie legend below.
ft['type'] = 'gnc:' + ft.gnc + ' ewc:' + ft.ewc + ' vmc:' + ft.vmc + ' sbi:' + ft.sbi
ft.head()
# Side-by-side pies: the 20 heaviest flow types (left, with legend) versus
# the full distribution (right) to show how fragmented the long tail is.
fig, (ax_top, ax_all) = plt.subplots(1, 2, figsize=(18, 5))
top20 = ft.head(20)
ax_top.pie(top20.kg)
ax_top.legend(top20.type, loc='center left', bbox_to_anchor=(1, 0.5))
ax_top.set_title('top 20 flow types from afgifte dataset')
ax_all.pie(ft.kg)
ax_all.set_title('all flow types ({} in total)'.format(len(ft.kg)))
plt.show()
Here, we tried to categorize the afgifte flows using the gnc, ewc, vmc, and sbi. As seen above, there are 1864 flow types, which is a lot. There are so many flow types because of the SBI codes: many companies have multiple SBI codes, as many as thirty!
# NOTE(review): numCom counts dataset ROWS with sbiLen == i, not distinct
# companies, so the printed word 'companies' overstates what is counted --
# confirm whether unique eaNaam was intended.
print('how many sbi codes do companies have in the dataset?')
for i in sorted(df.sbiLen.unique()):
    numCom = len(df[df.sbiLen == i])  # rows whose receiver has exactly i sbi chapters
    print('{} companies have {} sbi codes'.format(numCom, i))
# Total kg grouped by how many SBI chapters the receiver carries.
sbiLen = df.groupby('sbiLen').sum().sort_values('kg', ascending=False).reset_index()
fig, ax = plt.subplots(1,1,figsize=(9*2,5))
ax.pie(sbiLen.kg)
ax.legend(sbiLen.sbiLen, loc='center left', bbox_to_anchor=(1,0.5))
ax.set_title('% waste (in kg) associated with x number of sbis')
plt.show()
# Inspect the rows with the maximum observed number (30) of SBI chapters.
df[df.sbiLen == 30]
# 30 associated sbi codes of gemeente amsterdam
# make a DataFrame out of the 30 sbi chapter codes of the first such row
ams = df[df.sbiLen == 30].sbi.iloc[0]  # comma-joined chapter string
ams = ams.split(',')
ams = pd.DataFrame(ams, columns=['sbi'])
# merge with sbi chapters to attach a human-readable heading per code
sbiChap = feather.read_dataframe('classification/sbi_Headings.feather')
ams = pd.merge(ams, sbiChap[['sbi', 'sbiDesc']], how='left', on='sbi')
# display
ams.head(10)
For companies with multiple SBI codes, the SBI code is set to '--'.
# Collapse companies with multiple SBI chapters into a single '--' category,
# so every row carries at most one SBI chapter.
# BUGFIX: the original did `dfOneSbi = df`, which aliases df -- the .loc write
# below then silently mutated df itself. Take an explicit copy instead.
dfOneSbi = df.copy()
dfOneSbi.loc[dfOneSbi.sbiLen > 1, 'sbi'] = '--'
# Aggregate kg per (gnc, ewc, vmc, sbi) flow type and label each type.
dfOneSbi = dfOneSbi.groupby(['gnc', 'ewc', 'vmc', 'sbi']).sum().kg.reset_index()
dfOneSbi['type'] = 'gnc:' + dfOneSbi.gnc + ' ewc:' + dfOneSbi.ewc + ' vmc:' + dfOneSbi.vmc + ' sbi:' + dfOneSbi.sbi
dfOneSbi = dfOneSbi.sort_values('kg', ascending=False)
num = len(dfOneSbi.sbi.unique())
print('number of unique sbi chapters: {}'.format(num))
# Pies for the single-SBI flow types: top 20 (left) vs all of them (right).
fig, ax = plt.subplots(1,2,figsize=(9*2,5))
ax[0].pie(dfOneSbi.kg.head(20))
ax[0].legend(dfOneSbi.type.head(20), loc='center left', bbox_to_anchor=(1,0.5))
ax[0].set_title('top 20 flow types from afgifte dataset')
ax[1].pie(dfOneSbi.kg)
ax[1].set_title('all flow types ({} in total)'.format(len(dfOneSbi.kg)))
plt.show()
Now let's try to use sbi section headers to categorize the flows.
# read sbiChap, which includes chapter headings and which section each chapter belongs to.
sbiChap = feather.read_dataframe('classification/sbi_Chapters.feather')
sbiChap = sbiChap[['section', 'sbi', 'sbiDesc']]
# make df with sbi section heads
dfSbiSec = dfOneSbi.copy()
# reset_index so the merge result (which gets a fresh RangeIndex) lines up
# positionally when assigned back as a column
dfSbiSec.reset_index(inplace=True, drop=True)
dfSbiSec['sbiSec'] = pd.merge(dfSbiSec.sbi, sbiChap, how='left', on='sbi').section
dfSbiSec.sbiSec.fillna('--', inplace=True)  # chapters without a section (e.g. '--')
# groupby: aggregate kg at section level and rebuild the legend label
dfSbiSec = dfSbiSec.groupby(['gnc', 'ewc', 'vmc', 'sbiSec']).sum().reset_index().sort_values('kg', ascending=False)
dfSbiSec['type'] = 'gnc:' + dfSbiSec.gnc + ' ewc:' + dfSbiSec.ewc + ' vmc:' + dfSbiSec.vmc + ' sbi:' + dfSbiSec.sbiSec
print('number of unique sbi sections: {}'.format(len(dfSbiSec.sbiSec.unique())))
# Pies for the section-level flow types: top 20 (left) vs all (right).
fig, ax = plt.subplots(1,2,figsize=(9*2,5))
ax[0].pie(dfSbiSec.kg.head(20))
ax[0].legend(dfSbiSec.type.head(20), loc='center left', bbox_to_anchor=(1,0.5))
ax[0].set_title('top 20 flow types from afgifte dataset')
ax[1].pie(dfSbiSec.kg)
ax[1].set_title('all flow types ({} in total)'.format(len(dfSbiSec.kg)))
plt.show()
# make copy for new df: same as dfSbiSec but with GNC chapters also rolled
# up into GNC sections ('type' is dropped and rebuilt below)
dfGncSec = dfSbiSec.copy()
dfGncSec.drop(labels=['type'], axis=1, inplace=True)
dfGncSec.reset_index(inplace=True, drop=True)  # fresh RangeIndex for the positional column assignment
# add gnc section headers
gncSec = feather.read_dataframe('classification/gnc_Headings.feather')
dfGncSec['gncSec'] = pd.merge(dfGncSec.gnc, gncSec[['gnc', 'section']], how='left', on='gnc').section
dfGncSec = dfGncSec[['gnc','gncSec', 'ewc', 'vmc', 'sbiSec', 'kg']]
# BUGFIX: was `replace(np.NaN, '--')` -- np.NaN was removed in NumPy 2.0.
# fillna is equivalent here and matches the sbiSec handling above.
dfGncSec.gncSec.fillna('--', inplace=True)
# groupby: aggregate kg at (gnc section, ewc, vmc, sbi section) level
dfGncSec = dfGncSec.groupby(['gncSec', 'ewc', 'vmc', 'sbiSec']).sum().reset_index().sort_values('kg', ascending=False)
dfGncSec['type'] = 'gnc:' + dfGncSec.gncSec + ' ewc:' + dfGncSec.ewc + ' vmc:' + dfGncSec.vmc + ' sbi:' + dfGncSec.sbiSec
# Pies for the gnc-section flow types: top 20 (left) vs all (right).
fig, ax = plt.subplots(1,2,figsize=(9*2,5))
ax[0].pie(dfGncSec.kg.head(20))
ax[0].legend(dfGncSec.type.head(20), loc='center left', bbox_to_anchor=(1,0.5))
ax[0].set_title('top 20 flow types from afgifte dataset')
ax[1].pie(dfGncSec.kg)
ax[1].set_title('all flow types ({} in total)'.format(len(dfGncSec.kg)))
plt.show()
Without the SBIs, there are fewer types - only 118 in total, see below.
# Flow types without the SBI dimension: far fewer combinations remain.
ftNoSbi = ft.groupby(['gnc', 'ewc', 'vmc']).sum().kg.reset_index().sort_values('kg', ascending=False)
ftNoSbi['type'] = 'gnc:' + ftNoSbi.gnc + ' ewc:' + ftNoSbi.ewc + ' vmc:' + ftNoSbi.vmc
fig, ax = plt.subplots(1,2,figsize=(9*2,5))
ax[0].pie(ftNoSbi.kg.head(20))
ax[0].legend(ftNoSbi.type.head(20), loc='center left', bbox_to_anchor=(1,0.5))
ax[0].set_title('top 20 flow types from afgifte dataset')
ax[1].pie(ftNoSbi.kg)
ax[1].set_title('all flow types ({} in total)'.format(len(ftNoSbi.kg)))
plt.show()
# groupby to create pie chart (matplotlib)
# For each code system: total kg per code, then lump everything under a
# hand-picked weight cutoff into an 'other' slice so the pies stay legible.
# NOTE(review): the cutoffs (150M, 800M, 161052464 kg) look manually tuned to
# keep only the largest slices -- confirm before reusing.
# gnc
gnc = ft.groupby('gnc').sum().reset_index()
gnc = gnc.sort_values('kg', ascending=False)
gncLen = len(gnc)  # number of distinct gnc chapters, reported further down
gnc.loc[gnc['kg'] < 150000000, 'gnc'] = 'other'
gnc = gnc.groupby('gnc').sum().reset_index().sort_values('kg', ascending=False)
gnc = gnc[gnc.gnc != '--']  # drop the missing-code placeholder
# ewc
ewc = ft.groupby('ewc').sum().reset_index()
ewc = ewc.sort_values('kg', ascending=False)
ewcLen = len(ewc)
ewc.loc[ewc['kg'] < 800000000, 'ewc'] = 'other'
ewc = ewc.groupby('ewc').sum().reset_index().sort_values('kg', ascending=False)
ewc = ewc[ewc.ewc != '--']
# vmc (few enough codes that no 'other' lumping is needed)
vmc = ft.groupby('vmc').sum().reset_index()
vmc = vmc.sort_values('kg', ascending=False)
vmcLen = len(vmc)
# sbi (note: this rebinds sbiLen from the DataFrame created earlier to an int)
sbi = ft.groupby('sbi').sum().kg.reset_index().sort_values('kg', ascending=False)
sbiLen = len(sbi)
sbi.loc[sbi['kg'] < 161052464, 'sbi'] = 'other'
sbi = sbi.groupby('sbi').sum().reset_index().sort_values('kg', ascending=False)
# merge gnc and ewc columns to create df of material (mat for short)
gnc['mat'] = 'gnc' + gnc.gnc  # prefix so gnc and ewc codes stay distinguishable
ewc['mat'] = 'ewc' + ewc.ewc
mat = pd.concat([gnc, ewc])
mat.drop(labels=['gnc', 'ewc', 'sbiLen'], inplace=True, axis=1)
mat = mat[['mat', 'kg']]
mat.sort_values('kg', ascending=False, inplace=True)
# Three pies: material (gnc/ewc combined), processing (vmc), industry (sbi),
# all weighted by kg.
fig, ax = plt.subplots(1,3,figsize=(8*3,6))
ax[0].pie(mat.kg)
ax[0].legend(mat.mat, loc='center left', bbox_to_anchor=(1,0.5))
ax[0].set_title('gnc/ewc')
ax[1].pie(vmc.kg)
ax[1].legend(vmc.vmc, loc='center left', bbox_to_anchor=(1,0.5))
ax[1].set_title('vmc')
ax[2].pie(sbi.kg)
ax[2].legend(sbi.sbi, loc='center left', bbox_to_anchor=(1,0.5))
ax[2].set_title('sbi')
# display
print('distribution of gnc, ewc, and vmc codes in afgifte dataset by weight')
plt.show()
# the *Len counters were captured before the 'other' lumping, so they report
# the true number of distinct codes
print('# unique gnc codes: {}'.format(gncLen))
print('# unique ewc codes: {}'.format(ewcLen))
print('# unique vmc codes: {}'.format(vmcLen))
print('# unique sbi codes: {}'.format(sbiLen))
So far, we've tried to categorize the flows in the following ways:
I will pick the third categorization method and add descriptions to it. (The fourth method wasn't chosen because using GNC sections turned out to be too vague.) With this, we can work with LMA experts to identify the flow types with locations that represent the real location of reuse. If it turns out that SBI codes don't really make a difference, then I will move on to the last categorization method, which doesn't include SBI codes.
# ADD DESCRIPTIONS
# read classification files (code -> human-readable description lookups)
gncDesc = feather.read_dataframe('classification/gnc_Headings.feather')
ewcDesc = feather.read_dataframe('classification/ewc.feather')
vmcDesc = feather.read_dataframe('classification/vmc.feather')
sbiDesc = feather.read_dataframe('classification/sbi_Headings.feather')
# make copy of dfSbiSec (dfl stands for df for lma)
# (the original comment said dfGncSec, but the code copies dfSbiSec --
# the chosen categorization keeps gnc chapters and sbi sections)
dfl = dfSbiSec.copy()
# add descriptions
dfl = pd.merge(dfl, gncDesc[['gnc', 'gncDesc']], how='left', on='gnc') # gnc
dfl = pd.merge(dfl, ewcDesc, how='left', on='ewc') # ewc
dfl = pd.merge(dfl, vmcDesc, how='left', on='vmc') # vmc
dfl = pd.merge(dfl, sbiDesc[['sbi', 'sbiDesc']], how='left', left_on='sbiSec', right_on='sbi') # sbi section
# rearrange columns
dfl = dfl[['gnc', 'gncDesc', 'ewc', 'ewcDesc', 'vmc', 'vmcDesc', 'sbiSec', 'sbiDesc', 'kg']]
# BUGFIX: was `replace(np.NaN, '--')` -- np.NaN was removed in NumPy 2.0;
# fillna('--') is equivalent for filling missing descriptions.
dfl.fillna('--', inplace=True)
# display all flow types with descriptions
dfl.head(10)
# make different flow types (progressively coarser groupings of dfl)
dfNoSbi = dfl.groupby(['gnc', 'gncDesc', 'ewc', 'ewcDesc', 'vmc', 'vmcDesc']).sum().sort_values('kg', ascending=False).reset_index() # flow types without sbi
dfMat = dfl.groupby(['gnc', 'gncDesc', 'ewc', 'ewcDesc']).sum().sort_values('kg', ascending=False).reset_index() # material flow types (ewc, gnc)
dfPro = dfl.groupby(['vmc', 'vmcDesc']).sum().sort_values('kg', ascending=False).reset_index() # processing flow types (vmc)
import xlsxwriter  # noqa: F401 -- imported only so a missing engine fails fast here
# Write each flow-type view to one xlsx workbook, one sheet per view.
# The context manager replaces writer.save(), which was deprecated in
# pandas 1.5 and removed in pandas 2.0; it also guarantees the file is
# closed even if a to_excel call raises.
with pd.ExcelWriter('lma/flowTypes.xlsx', engine='xlsxwriter') as writer:
    dfl.to_excel(writer, sheet_name='flowTypes_allCodes')
    dfNoSbi.to_excel(writer, sheet_name='flowTypes_noSbi')
    dfMat.to_excel(writer, sheet_name='flowTypes_mat')
    dfPro.to_excel(writer, sheet_name='flowTypes_pro')
There seem to be some combinations of material & processing codes (e.g. reused construction and demolition waste) that are associated with all SBI codes. This means that all industries, from financial services to construction companies, are involved in reusing C&D waste (see example below).
Not all flow types are associated with multiple SBIs, and this section will further explain this.
# Example: every SBI-section variant of one material+processing combination
# (the third-heaviest flow type).
row = dfl.iloc[2]
mask = (dfl.gnc == row.gnc) & (dfl.ewc == row.ewc) & (dfl.vmc == row.vmc)
dfl[mask]
# finding number of sbis associated for each flow type (flow type = material + processing type)
# count number of sbis associated for each flow type
def numSbi(row):
    """Number of dfl rows (= distinct sbi sections) sharing this row's gnc/ewc/vmc."""
    mask = (dfl.gnc == row.gnc) & (dfl.ewc == row.ewc) & (dfl.vmc == row.vmc)
    return len(dfl[mask])
dfNoSbi['numSbi'] = dfNoSbi.apply(lambda row: numSbi(row), axis=1)
# number of flow types with x number of associated sbis
# NOTE(review): this rebinds the name numSbi from the function above to a
# DataFrame -- the function is not callable after this line
numSbi = dfNoSbi.groupby('numSbi').count().gnc.reset_index().sort_values('numSbi')
numSbi.rename(columns={'gnc': 'count'}, inplace=True)
# kg of waste with x number of associated sbis
kgSbi = dfNoSbi.groupby('numSbi').sum().sort_values('kg', ascending=False).reset_index()
# pie chart explaining number of SBIs associated with major flows
fig, ax = plt.subplots(1,2,figsize=(9*2,6))
# rows with x number of associated sbis (by count)
ax[0].pie(numSbi['count'])
ax[0].legend(numSbi.numSbi, loc='center left', bbox_to_anchor=(1,0.5), title='# associated sbis')
ax[0].set_title('% of flow types with x number of associated sbis')
# kg of waste with x number of associated sbis (by weight)
ax[1].pie(kgSbi.kg)
ax[1].legend(kgSbi.numSbi, loc='center left', bbox_to_anchor=(1,0.5), title='# associated sbis')
ax[1].set_title('kg of waste types with x number of associated sbis')
plt.show()
def asSbi(gnc, ewc, vmc):
    """Return the dfl rows whose gnc, ewc and vmc codes all equal the given values."""
    selected = (dfl.gnc == gnc) & (dfl.ewc == ewc) & (dfl.vmc == vmc)
    return dfl.loc[selected]
# Spot-check flow types with different numbers of associated SBI sections
# (in a notebook only the last call's result is displayed).
asSbi('--', '17', 'B') # flow type with 20 SBIs: reused C&D waste
asSbi('23', '--', 'B') # flow type with 1 sbi: reused food waste
asSbi('--', '17', 'E') # flow type with 2 sbis: composted c&d waste
asSbi('--', '15', 'D') # flow type with 3 sbis: mechanically treated waste packaging
# Re-read the raw records (df was reduced and mutated above).
df = feather.read_dataframe('lma/af_2019-2020_matched-codes.feather')
# turn the comma-separated sbi columns into lists
df.sbi = df.sbi.str.split(',')
df.sbiDesc = df.sbiDesc.str.split(',')
# BUGFIX: explode both list columns in lockstep so code i stays paired with
# description i. The original exploded sbi alone and then assigned a
# separately exploded sbiDesc Series; with the duplicated index that explode
# produces, that assignment aligns by label rather than position and raises
# on duplicate labels in modern pandas. Multi-column explode (pandas >= 1.3)
# pairs the elements positionally.
df = df.explode(['sbi', 'sbiDesc'])
SBI codes related to 'making': 01, 10-33, 41-43, 56, 95
Details: 01-Agriculture, 10-food products, 11-beverages, 12-tobacco, 13-textiles, 14-wearing apparel, 15-leather, 16-wood, 17-paper, 18-printing, 19-petroleum, 20-chemicals, 21-pharmaceutical, 22-rubber and plastic, 23-nonmetallic mineral, 24-metals, 25-metal products, 26-electronics, 27-electrical equipment, 28-machinery, 29-vehicles, 30-transport equipment, 31-furniture, 32-other, 33-repair of machinery, 41-43-construction, 56-catering, 95-repair of computers and consumer goods
# SELECTING 'MAKING' ROWS
# create list of 'making' sbis: agriculture (01), manufacturing (10-33),
# construction (41-43), catering (56) and repair (95).
# Comprehensions replace the original append loops; same codes, same order.
making_sbis = (['01']
               + [str(i) for i in range(10, 34)]
               + [str(i) for i in range(41, 44)]
               + ['56', '95'])
# select and group rows
# Keep the (exploded) rows whose sbi code starts with a 'making' chapter.
# ROBUSTNESS: na=False makes rows with a missing sbi evaluate False instead
# of NaN, which would break the boolean indexing below.
df_making = df[df.sbi.str.startswith(tuple(making_sbis), na=False)]
# First aggregate kg per (flow, single sbi code) ...
df_making = df_making.groupby(['verID', 'herNaam', 'herStraat', 'herPostcode', 'herPlaats', 'eaNaam',
                               'eaAddress', 'eaPostcode', 'eaLand', 'gnc', 'gnDesc', 'vmc', 'vmcDesc',
                               'year', 'sbi', 'sbiDesc']).sum().reset_index()
# ... then fold the making sbi codes/descriptions of each flow back into lists.
df_making = df_making.groupby(['verID', 'herNaam', 'herStraat', 'herPostcode', 'herPlaats', 'eaNaam',
                               'eaAddress', 'eaPostcode', 'eaLand', 'gnc', 'gnDesc', 'vmc', 'vmcDesc',
                               'year', 'kg']).aggregate(lambda x: list(x)).reset_index()
# stats and display
print('Rows with multiple SBIs: {}%'.format(round(len(df_making[df_making.sbi.map(len)>=2]) / len(df_making) * 100)))
df_making[df_making.sbi.map(len)>=2].head(3)
# what % of eerste afnemers are manufacturers? (by count)
num_makers = len(df_making.eaNaam.unique())  # distinct 'making' receivers
num_total = len(df.eaNaam.unique())          # distinct receivers overall
maker_bv = pd.DataFrame({
    'type': ['makers', 'non-makers'],
    'num': [num_makers, num_total - num_makers]
})
# display
print('maker eerste afnemers take up {}% of total number of companies'.format(round(num_makers / num_total * 100, 1)))
fig = px.pie(maker_bv, values='num', names='type', title='Making vs non-making companies (by count)', color='type',
             color_discrete_map={'makers': '#f55142', 'non-makers': 'lightgrey'})
fig.show()
# what % of eerste afnemers are manufacturers? (by weight)
# re-read the raw records so the total weight is not inflated by the explode above
df = feather.read_dataframe('lma/af_2019-2020_matched-codes.feather')
# NOTE(review): df_making.kg.sum() comes from the exploded-then-regrouped frame;
# confirm no double counting of multi-sbi flows survived the two-step groupby.
weight_makers = df_making.kg.sum()
weight_total = df.kg.sum()
maker_kg = pd.DataFrame({
    'type': ['makers', 'non-makers'],
    'num': [weight_makers, weight_total - weight_makers]
})
# display
print('making eerste afnemers take up {}% of total weight (kg) of waste-to-resource flows'.format(round(weight_makers / weight_total * 100, 1)))
fig = px.pie(maker_kg, values='num', names='type', title='Making vs non-making companies (by weight)', color='type',
             color_discrete_map={'makers': '#f55142', 'non-makers': 'lightgrey'})
fig.show()
SBI codes related to 'making':
Details: 01-Agriculture, 10-food products, 11-beverages, 12-tobacco, 13-textiles, 14-wearing apparel, 15-leather, 16-wood, 17-paper, 18-printing, 19-petroleum, 20-chemicals, 21-pharmaceutical, 22-rubber and plastic, 23-nonmetallic mineral, 24-metals, 25-metal products, 26-electronics, 27-electrical equipment, 28-machinery, 29-vehicles, 30-transport equipment, 31-furniture, 32-other, 33-repair of machinery, 41-43-construction, 56-catering, 95-repair of computers and consumer goods
# create list of 'making' sbis: food, manufacturing, construction, and repair
# Comprehensions replace the original append loops; same codes, same order.
# NOTE(review): the four categories overlap (10/11 appear in both food and
# manufacturing, 95 in both manufacturing and repair), so the pie further
# down can count the same kg in more than one slice -- confirm intended.
food_sbis = ['01', '10', '11', '56']
manufacturing_sbis = ['95'] + [str(i) for i in range(10, 34)]
construction_sbis = [str(i) for i in range(41, 44)]
repair_sbis = ['331', '45', '95']
# select and group rows: total kg received per 'making' category.
# NOTE(review): the categories overlap (10/11 in food AND manufacturing, 95 in
# manufacturing AND repair), so the same kg can fall in multiple slices.
df_food = df[df.sbi.str.startswith(tuple(food_sbis))]
df_food_kg = df_food.kg.sum()
df_manufacturing = df[df.sbi.str.startswith(tuple(manufacturing_sbis))]
df_manufacturing_kg = df_manufacturing.kg.sum()
df_construction = df[df.sbi.str.startswith(tuple(construction_sbis))]
df_construction_kg = df_construction.kg.sum()
df_repair = df[df.sbi.str.startswith(tuple(repair_sbis))]
df_repair_kg = df_repair.kg.sum()
maker_kg = pd.DataFrame({
    'type': ['food', 'manufacturing', 'construction', 'repair'],
    'num': [df_food_kg, df_manufacturing_kg, df_construction_kg, df_repair_kg]
})
# display
fig = px.pie(maker_kg, values='num', names='type', title='Types of making companies (by weight)')
fig.show()
# which material (gnc) codes do the food companies receive?
df_food.gnc.unique()
# Load the dataset where each record carries either an ewc code (waste
# treatment) or a gnc code (waste-to-resource), but not both.
df = feather.read_dataframe('lma/af_2019-2020_ewc-and-gnc.feather')
gnc_kg = df[df.ewc.isna()].kg.sum()  # no ewc -> waste-to-resource weight
ewc_kg = df[df.gnc.isna()].kg.sum()  # no gnc -> waste-treatment weight
pie_kg = pd.DataFrame({
    'type': ['waste treatment', 'waste to resource'],
    'num': [ewc_kg, gnc_kg]
})
# display
fig = px.pie(pie_kg, values='num', names='type', title='waste treatment vs waste to resource (by weight)')
fig.show()
# Processing-method breakdown for the waste-treatment flows (ewc present).
df = feather.read_dataframe('lma/af_2019-2020_ewc-and-gnc.feather')
df_ewc = df[~df.ewc.isna()]
df_ewc = df_ewc.groupby(['vmc', 'vmcDesc']).sum().kg.reset_index()
# NOTE(review): 39965897 is presumably the kg of the 10th-largest method -- confirm
df_ewc.loc[df_ewc['kg'] < 39965897, 'vmcDesc'] = 'other (not top 10)'
# display
fig = px.pie(df_ewc, values='kg', names='vmcDesc', title='processing methods (by weight, for waste treatment flows)')
fig.show()
# Same breakdown for the waste-to-resource flows (rows without an ewc code).
df = feather.read_dataframe('lma/af_2019-2020_ewc-and-gnc.feather')
resource_rows = df[df.ewc.isna()]
df_gnc = resource_rows.groupby(['vmc', 'vmcDesc']).kg.sum().reset_index()
# lump everything below the top-10 cutoff into a single slice
df_gnc.loc[df_gnc['kg'] < 39965897, 'vmcDesc'] = 'other (not top 10)'
# display
fig = px.pie(df_gnc, values='kg', names='vmcDesc', title='processing methods (by weight, for waste-to-resource flows)')
fig.show()